From cc113283a5f5c82656872f3f4417d258d138df4b Mon Sep 17 00:00:00 2001 From: "kaf24@labyrinth.cl.cam.ac.uk" Date: Wed, 25 Aug 2004 15:40:15 +0000 Subject: [PATCH] bitkeeper revision 1.1159.51.2 (412cb2dfaIDYjySJYYMTByGbcM77UA) More grant-table code, and some related sundry improvements. --- xen/arch/x86/domain.c | 4 +- xen/arch/x86/memory.c | 108 ++++++++++++++++++++++------------ xen/arch/x86/setup.c | 2 +- xen/common/domain.c | 6 +- xen/common/grant_table.c | 107 +++++++++++++++++++++------------ xen/common/kernel.c | 12 +++- xen/common/page_alloc.c | 38 ++++++++---- xen/include/asm-x86/atomic.h | 63 +++++++++++--------- xen/include/asm-x86/mm.h | 54 ++++++++--------- xen/include/asm-x86/smp.h | 19 ------ xen/include/asm-x86/system.h | 20 +++---- xen/include/xen/grant_table.h | 17 +++++- xen/include/xen/sched.h | 27 +++++---- 13 files changed, 286 insertions(+), 191 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 87aa127781..8739506b73 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -668,9 +668,9 @@ int construct_dom0(struct domain *p, mfn++ ) { page = &frame_table[mfn]; - page->u.inuse.domain = p; + page->u.inuse.domain = p; page->u.inuse.type_info = 0; - page->u.inuse.count_info = PGC_allocated | 1; + page->u.inuse.count_info = PGC_always_set | PGC_allocated | 1; list_add_tail(&page->list, &p->page_list); p->tot_pages++; p->max_pages++; } diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c index 9f8bca4603..52c9dcca8d 100644 --- a/xen/arch/x86/memory.c +++ b/xen/arch/x86/memory.c @@ -153,6 +153,9 @@ void arch_init_memory(void) vm_assist_info[VMASST_TYPE_writable_pagetables].disable = ptwr_disable; + for ( mfn = 0; mfn < max_page; mfn++ ) + frame_table[mfn].u.inuse.count_info |= PGC_always_set; + /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */ memset(machine_to_phys_mapping, 0x55, 4<<20); @@ -179,9 +182,9 @@ void arch_init_memory(void) mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT; mfn++ ) { - frame_table[mfn].u.inuse.count_info = 1 | PGC_allocated; - frame_table[mfn].u.inuse.type_info = 1 | PGT_gdt_page; /* non-RW */ - frame_table[mfn].u.inuse.domain = dom_xen; + frame_table[mfn].u.inuse.count_info |= PGC_allocated | 1; + frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */ + frame_table[mfn].u.inuse.domain = dom_xen; } } @@ -370,6 +373,7 @@ get_page_from_l1e( { unsigned long l1v = l1_pgentry_val(l1e); unsigned long pfn = l1_pgentry_to_pagenr(l1e); + struct pfn_info *page = &frame_table[pfn]; extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); if ( !(l1v & _PAGE_PRESENT) ) @@ -383,6 +387,8 @@ get_page_from_l1e( if ( unlikely(!pfn_is_ram(pfn)) ) { + /* SPECIAL CASE 1. Mapping an I/O page. */ + /* Revert to caller privileges if FD == DOMID_IO. */ if ( d == dom_io ) d = current; @@ -397,17 +403,41 @@ get_page_from_l1e( return 0; } + if ( unlikely(!get_page_from_pagenr(pfn, d)) ) + { + /* SPECIAL CASE 2. Mapping a foreign page via a grant table. */ + + int rc; + struct domain *e; + u32 count_info; + /* + * Yuk! Amazingly this is the simplest way to get a guaranteed atomic + * snapshot of a 64-bit value on IA32. x86/64 solves this of course! + * Basically it's a no-op CMPXCHG, to get us the current contents. + * No need for LOCK prefix -- we know that count_info is never zero + * because it contains PGC_always_set. + */ + __asm__ __volatile__( + "cmpxchg8b %2" + : "=a" (e), "=d" (count_info), + "=m" (*(volatile u64 *)(&page->u.inuse.domain)) + : "0" (0), "1" (0), "b" (0), "c" (0) ); + if ( unlikely((count_info & PGC_count_mask) == 0) || + unlikely(e == NULL) || unlikely(!get_domain(e)) ) + return 0; + rc = gnttab_try_map(e, d, page, l1v & _PAGE_RW); + put_domain(e); + return rc; + } + if ( l1v & _PAGE_RW ) { - if ( unlikely(!get_page_and_type_from_pagenr( - pfn, PGT_writable_page, d)) ) + if ( unlikely(!get_page_type(page, PGT_writable_page)) ) return 0; - set_bit(_PGC_tlb_flush_on_type_change, - &frame_table[pfn].u.inuse.count_info); - return 1; + set_bit(_PGC_tlb_flush_on_type_change, &page->u.inuse.count_info); } - return get_page_from_pagenr(pfn, d); + return 1; } @@ -434,14 +464,33 @@ get_page_from_l2e( } -static void put_page_from_l1e(l1_pgentry_t l1e) +static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) { struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)]; unsigned long l1v = l1_pgentry_val(l1e); + struct domain *e = page->u.inuse.domain; if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(l1v >> PAGE_SHIFT) ) return; + if ( unlikely(e != d) ) + { + /* + * Unmap a foreign page that may have been mapped via a grant table. + * Note that this can fail for a privileged domain that can map foreign + * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings + * counted via a grant entry and some counted directly in the page + * structure's reference count. Note that reference counts won't get + * dangerously confused as long as we always try to decrement the + * grant entry first. We may end up with a mismatch between which + * mappings and which unmappings are counted via the grant entry, but + * really it doesn't matter as privileged domains have carte blanche. + */ + if ( likely(gnttab_try_unmap(e, d, page, l1v & _PAGE_RW)) ) + return; + /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */ + } + if ( l1v & _PAGE_RW ) { put_page_and_type(page); @@ -452,7 +501,7 @@ static void put_page_from_l1e(l1_pgentry_t l1e) if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == PGT_ldt_page)) && unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) - invalidate_shadow_ldt(page->u.inuse.domain); + invalidate_shadow_ldt(e); put_page(page); } } @@ -527,7 +576,7 @@ static int alloc_l1_table(struct pfn_info *page) fail: while ( i-- > 0 ) - put_page_from_l1e(pl1e[i]); + put_page_from_l1e(pl1e[i], d); unmap_domain_mem(pl1e); return 0; @@ -551,6 +600,7 @@ static void free_l2_table(struct pfn_info *page) static void free_l1_table(struct pfn_info *page) { + struct domain *d = page->u.inuse.domain; unsigned long page_nr = page - frame_table; l1_pgentry_t *pl1e; int i; @@ -558,7 +608,7 @@ static void free_l1_table(struct pfn_info *page) pl1e = map_domain_mem(page_nr << PAGE_SHIFT); for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) - put_page_from_l1e(pl1e[i]); + put_page_from_l1e(pl1e[i], d); unmap_domain_mem(pl1e); } @@ -651,6 +701,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) { l1_pgentry_t ol1e; unsigned long _ol1e; + struct domain *d = current; if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) { @@ -671,18 +722,18 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) { - put_page_from_l1e(nl1e); + put_page_from_l1e(nl1e, d); return 0; } - put_page_from_l1e(ol1e); + put_page_from_l1e(ol1e, d); return 1; } if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) return 0; - put_page_from_l1e(ol1e); + put_page_from_l1e(ol1e, d); return 1; } @@ -1289,20 +1340,10 @@ int do_update_va_mapping_otherdomain(unsigned long page_nr, } -static inline int readonly_page_from_l1e(l1_pgentry_t l1e) -{ - struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)]; - unsigned long l1v = l1_pgentry_val(l1e); - - if ( (l1v & _PAGE_RW) || !(l1v & _PAGE_PRESENT) || - !pfn_is_ram(l1v >> PAGE_SHIFT) ) - return 0; - put_page_type(page); - return 1; -} - -/* Writable Pagetables */ +/************************* + * Writable Pagetables + */ ptwr_info_t ptwr_info[NR_CPUS] = { [ 0 ... NR_CPUS-1 ] = @@ -1365,13 +1406,8 @@ void ptwr_reconnect_disconnected(unsigned long addr) nl1e = pl1e[i]; if (likely(l1_pgentry_val(nl1e) == l1_pgentry_val(ol1e))) continue; - if (likely((l1_pgentry_val(nl1e) ^ l1_pgentry_val(ol1e)) == - _PAGE_RW)) { - if (likely(readonly_page_from_l1e(nl1e))) - continue; - } if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT)) - put_page_from_l1e(ol1e); + put_page_from_l1e(ol1e, current); if (unlikely(!get_page_from_l1e(nl1e, current))) BUG(); } @@ -1438,7 +1474,7 @@ void ptwr_flush_inactive(void) if (likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e))) continue; if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT)) - put_page_from_l1e(ol1e); + put_page_from_l1e(ol1e, current); if (unlikely(!get_page_from_l1e(nl1e, current))) BUG(); } diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 3d18ebd4ee..975f8a4724 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -411,7 +411,7 @@ void __init start_of_day(void) clear_bit(smp_processor_id(), &wait_init_idle); smp_threads_ready = 1; smp_commence(); /* Tell other CPUs that state of the world is stable. */ - while (wait_init_idle) + while ( wait_init_idle != 0 ) { cpu_relax(); barrier(); diff --git a/xen/common/domain.c b/xen/common/domain.c index 7682381032..55621847d0 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -232,12 +232,16 @@ void domain_destruct(struct domain *d) { struct domain **pd; unsigned long flags; + atomic_t old, new; if ( !test_bit(DF_DYING, &d->flags) ) BUG(); /* May be already destructed, or get_domain() can race us. */ - if ( cmpxchg(&d->refcnt.counter, 0, DOMAIN_DESTRUCTED) != 0 ) + _atomic_set(old, 0); + _atomic_set(new, DOMAIN_DESTRUCTED); + old = atomic_compareandswap(old, new, &d->refcnt); + if ( _atomic_read(old) != 0 ) return; DPRINTK("Releasing task %u\n", d->domain); diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c index 27f81b9e22..f76b18f8a8 100644 --- a/xen/common/grant_table.c +++ b/xen/common/grant_table.c @@ -24,6 +24,13 @@ #include #include +#define PIN_FAIL(_rc, _f, _a...) \ + do { \ + DPRINTK( _f, ## _a ); \ + rc = -(_rc); \ + goto out; \ + } while ( 0 ) + static inline void check_tlb_flush( active_grant_entry_t *a) @@ -70,6 +77,7 @@ gnttab_update_pin_status( active_grant_entry_t *act; grant_entry_t *sha; long rc = 0; + unsigned long frame; ld = current; @@ -93,8 +101,11 @@ gnttab_update_pin_status( return -EINVAL; } - if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ) + if ( unlikely((rd = find_domain_by_id(dom)) == NULL) || + unlikely(ld == rd) ) { + if ( rd != NULL ) + put_domain(rd); DPRINTK("Could not find domain %d\n", dom); return -ESRCH; } @@ -102,6 +113,8 @@ gnttab_update_pin_status( act = &rd->grant_table->active[ref]; sha = &rd->grant_table->shared[ref]; + spin_lock(&rd->grant_table->lock); + if ( act->status == 0 ) { if ( unlikely(pin_flags == 0) ) @@ -118,23 +131,17 @@ gnttab_update_pin_status( if ( unlikely((sflags & GTF_type_mask) != GTF_permit_access) || unlikely(sdom != ld->domain) ) - { - DPRINTK("Bad flags (%x) or dom (%d). (NB. expected dom %d)\n", + PIN_FAIL(EINVAL, + "Bad flags (%x) or dom (%d). (NB. expected dom %d)\n", sflags, sdom, ld->domain); - rc = -EINVAL; - goto out; - } sflags |= GTF_reading; if ( !(pin_flags & GNTPIN_readonly) ) { sflags |= GTF_writing; if ( unlikely(sflags & GTF_readonly) ) - { - DPRINTK("Attempt to write-pin a read-only grant entry.\n"); - rc = -EINVAL; - goto out; - } + PIN_FAIL(EINVAL, + "Attempt to write-pin a r/o grant entry.\n"); } /* Merge two 16-bit values into a 32-bit combined update. */ @@ -144,11 +151,8 @@ gnttab_update_pin_status( /* NB. prev_sflags is updated in place to seen value. */ if ( unlikely(cmpxchg_user((u32 *)&sha->flags, prev_scombo, prev_scombo | GTF_writing)) ) - { - DPRINTK("Fault while modifying shared flags and domid.\n"); - rc = -EINVAL; - goto out; - } + PIN_FAIL(EINVAL, + "Fault while modifying shared flags and domid.\n"); /* Did the combined update work (did we see what we expected?). */ if ( prev_scombo == scombo ) @@ -161,10 +165,22 @@ gnttab_update_pin_status( } /* rmb(); */ /* not on x86 */ + frame = sha->frame; + if ( unlikely(!pfn_is_ram(frame)) || + unlikely(!((pin_flags & GNTPIN_readonly) ? + get_page(&frame_table[frame], rd) : + get_page_and_type(&frame_table[frame], rd, + PGT_writable_page))) ) + { + clear_bit(_GTF_writing, &sha->flags); + clear_bit(_GTF_reading, &sha->flags); + PIN_FAIL(EINVAL, + "Could not pin the granted frame!\n"); + } act->status = pin_flags; act->domid = sdom; - act->frame = sha->frame; + act->frame = frame; make_entry_mappable(rd->grant_table, act); } @@ -174,11 +190,13 @@ gnttab_update_pin_status( if ( unlikely((act->status & (GNTPIN_wmap_mask|GNTPIN_rmap_mask)) != 0) ) - { - DPRINTK("Attempt to deactivate a mapped g.e. (%x)\n", act->status); - rc = -EINVAL; - goto out; - } + PIN_FAIL(EINVAL, + "Attempt to deactiv a mapped g.e. (%x)\n", act->status); + + frame = act->frame; + if ( !(act->status & GNTPIN_readonly) ) + put_page_type(&frame_table[frame]); + put_page(&frame_table[frame]); act->status = 0; make_entry_unmappable(rd->grant_table, act); @@ -199,12 +217,9 @@ gnttab_update_pin_status( (unlikely((act->status & GNTPIN_wmap_mask) != 0) || (((pin_flags & GNTPIN_host_accessible) == 0) && unlikely((act->status & GNTPIN_rmap_mask) != 0))) ) - { - DPRINTK("Attempt to reduce pinning of a mapped g.e. (%x,%x)\n", + PIN_FAIL(EINVAL, + "Attempt to reduce pinning of a mapped g.e. (%x,%x)\n", pin_flags, act->status); - rc = -EINVAL; - goto out; - } /* Check for changes to host accessibility. */ if ( pin_flags & GNTPIN_host_accessible ) @@ -220,6 +235,7 @@ gnttab_update_pin_status( { if ( !(act->status & GNTPIN_readonly) ) { + put_page_type(&frame_table[act->frame]); check_tlb_flush(act); clear_bit(_GTF_writing, &sha->flags); } @@ -231,20 +247,19 @@ gnttab_update_pin_status( prev_sflags = sflags; if ( unlikely(prev_sflags & GTF_readonly) ) - { - DPRINTK("Attempt to write-pin a read-only grant entry.\n"); - rc = -EINVAL; - goto out; - } - + PIN_FAIL(EINVAL, + "Attempt to write-pin a r/o grant entry.\n"); + + if ( unlikely(!get_page_type(&frame_table[act->frame], + PGT_writable_page)) ) + PIN_FAIL(EINVAL, + "Attempt to write-pin a unwritable page.\n"); + /* NB. prev_sflags is updated in place to seen value. */ if ( unlikely(cmpxchg_user(&sha->flags, prev_sflags, prev_sflags | GTF_writing)) ) - { - DPRINTK("Fault while modifying shared flags.\n"); - rc = -EINVAL; - goto out; - } + PIN_FAIL(EINVAL, + "Fault while modifying shared flags.\n"); } while ( prev_sflags != sflags ); } @@ -261,6 +276,7 @@ gnttab_update_pin_status( (void)__put_user(act->frame, &uop->host_phys_addr); out: + spin_unlock(&rd->grant_table->lock); put_domain(rd); return rc; } @@ -289,6 +305,20 @@ do_grant_table_op( return rc; } +int +gnttab_try_map( + struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly) +{ + return 0; +} + +int +gnttab_try_unmap( + struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly) +{ + return 0; +} + int grant_table_create( struct domain *d) @@ -318,6 +348,7 @@ grant_table_create( SHARE_PFN_WITH_DOMAIN(virt_to_page(t->shared), d); /* Okay, install the structure. */ + wmb(); /* avoid races with lock-free access to d->grant_table */ d->grant_table = t; return 0; diff --git a/xen/common/kernel.c b/xen/common/kernel.c index 6c0775c9d2..3e37bded7d 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -296,9 +296,19 @@ void cmain(multiboot_info_t *mbi) xmem_cache_init(); xmem_cache_sizes_init(max_page); + /* + * Create a domain-structure allocator. The SLAB_NO_REAP flag is essential! + * This is because in some situations a domain's reference count will be + * incremented by someone with no other handle on the structure -- this is + * inherently racey because the struct could be freed by the time that the + * count is incremented. By specifying 'no-reap' we ensure that, worst + * case, they increment some other domain's count, rather than corrupting + * a random field in a random structure! + * See, for example, arch/x86/memory.c:get_page_from_l1e(). + */ domain_struct_cachep = xmem_cache_create( "domain_cache", sizeof(struct domain), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL); if ( domain_struct_cachep == NULL ) panic("No slab cache for task structs."); diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 52da9c042b..79b8df7452 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -300,12 +300,21 @@ void init_xenheap_pages(unsigned long ps, unsigned long pe) unsigned long alloc_xenheap_pages(int order) { struct pfn_info *pg; - int attempts = 0; + int i, attempts = 0; retry: if ( unlikely((pg = alloc_heap_pages(MEMZONE_XEN, order)) == NULL) ) goto no_memory; + memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT)); + + for ( i = 0; i < (1 << order); i++ ) + { + pg[i].u.inuse.count_info = PGC_always_set; + pg[i].u.inuse.domain = NULL; + pg[i].u.inuse.type_info = 0; + } + return (unsigned long)page_to_virt(pg); no_memory: @@ -343,7 +352,7 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, int order) { struct pfn_info *pg; unsigned long mask, flushed_mask, pfn_stamp, cpu_stamp; - int i; + int i, j; ASSERT(!in_irq()); @@ -353,19 +362,16 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, int order) flushed_mask = 0; for ( i = 0; i < (1 << order); i++ ) { - pg[i].u.inuse.domain = NULL; - pg[i].u.inuse.type_info = 0; - if ( (mask = (pg[i].u.free.cpu_mask & ~flushed_mask)) != 0 ) { pfn_stamp = pg[i].tlbflush_timestamp; - for ( i = 0; (mask != 0) && (i < smp_num_cpus); i++ ) + for ( j = 0; (mask != 0) && (j < smp_num_cpus); j++ ) { - if ( mask & (1<page_list); } @@ -418,10 +428,13 @@ void free_domheap_pages(struct pfn_info *pg, int order) if ( unlikely(IS_XEN_HEAP_FRAME(pg)) ) { spin_lock_recursive(&d->page_alloc_lock); + for ( i = 0; i < (1 << order); i++ ) list_del(&pg[i].list); + d->xenheap_pages -= 1 << order; drop_dom_ref = (d->xenheap_pages == 0); + spin_unlock_recursive(&d->page_alloc_lock); } else if ( likely(d != NULL) ) @@ -431,9 +444,8 @@ void free_domheap_pages(struct pfn_info *pg, int order) for ( i = 0; i < (1 << order); i++ ) { - pg[i].tlbflush_timestamp = tlbflush_clock; - pg[i].u.inuse.count_info = 0; - pg[i].u.free.cpu_mask = 1 << d->processor; + pg[i].tlbflush_timestamp = tlbflush_clock; + pg[i].u.free.cpu_mask = 1 << d->processor; list_del(&pg[i].list); } diff --git a/xen/include/asm-x86/atomic.h b/xen/include/asm-x86/atomic.h index b64adaedba..f2ecf955e3 100644 --- a/xen/include/asm-x86/atomic.h +++ b/xen/include/asm-x86/atomic.h @@ -2,11 +2,7 @@ #define __ARCH_X86_ATOMIC__ #include - -/* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - */ +#include #ifdef CONFIG_SMP #define LOCK "lock ; " @@ -15,11 +11,11 @@ #endif /* - * Make sure gcc doesn't try to be clever and move things around - * on us. We need to use _exactly_ the address the user gave us, - * not some alias that contains the same information. + * NB. I've pushed the volatile qualifier into the operations. This allows + * fast accessors such as _atomic_read() and _atomic_set() which don't give + * the compiler a fit. */ -typedef struct { volatile int counter; } atomic_t; +typedef struct { int counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } @@ -29,8 +25,9 @@ typedef struct { volatile int counter; } atomic_t; * * Atomically reads the value of @v. Note that the guaranteed * useful range of an atomic_t is only 24 bits. - */ -#define atomic_read(v) ((v)->counter) + */ +#define _atomic_read(v) ((v).counter) +#define atomic_read(v) (*(volatile int *)&((v)->counter)) /** * atomic_set - set atomic variable @@ -40,7 +37,8 @@ typedef struct { volatile int counter; } atomic_t; * Atomically sets the value of @v to @i. Note that the guaranteed * useful range of an atomic_t is only 24 bits. */ -#define atomic_set(v,i) (((v)->counter) = (i)) +#define _atomic_set(v,i) (((v).counter) = (i)) +#define atomic_set(v,i) (*(volatile int *)&((v)->counter) = (i)) /** * atomic_add - add integer to atomic variable @@ -54,8 +52,8 @@ static __inline__ void atomic_add(int i, atomic_t *v) { __asm__ __volatile__( LOCK "addl %1,%0" - :"=m" (v->counter) - :"ir" (i), "m" (v->counter)); + :"=m" (*(volatile int *)&v->counter) + :"ir" (i), "m" (*(volatile int *)&v->counter)); } /** @@ -70,8 +68,8 @@ static __inline__ void atomic_sub(int i, atomic_t *v) { __asm__ __volatile__( LOCK "subl %1,%0" - :"=m" (v->counter) - :"ir" (i), "m" (v->counter)); + :"=m" (*(volatile int *)&v->counter) + :"ir" (i), "m" (*(volatile int *)&v->counter)); } /** @@ -90,8 +88,8 @@ static __inline__ int atomic_sub_and_test(int i, atomic_t *v) __asm__ __volatile__( LOCK "subl %2,%0; sete %1" - :"=m" (v->counter), "=qm" (c) - :"ir" (i), "m" (v->counter) : "memory"); + :"=m" (*(volatile int *)&v->counter), "=qm" (c) + :"ir" (i), "m" (*(volatile int *)&v->counter) : "memory"); return c; } @@ -106,8 +104,8 @@ static __inline__ void atomic_inc(atomic_t *v) { __asm__ __volatile__( LOCK "incl %0" - :"=m" (v->counter) - :"m" (v->counter)); + :"=m" (*(volatile int *)&v->counter) + :"m" (*(volatile int *)&v->counter)); } /** @@ -121,8 +119,8 @@ static __inline__ void atomic_dec(atomic_t *v) { __asm__ __volatile__( LOCK "decl %0" - :"=m" (v->counter) - :"m" (v->counter)); + :"=m" (*(volatile int *)&v->counter) + :"m" (*(volatile int *)&v->counter)); } /** @@ -140,8 +138,8 @@ static __inline__ int atomic_dec_and_test(atomic_t *v) __asm__ __volatile__( LOCK "decl %0; sete %1" - :"=m" (v->counter), "=qm" (c) - :"m" (v->counter) : "memory"); + :"=m" (*(volatile int *)&v->counter), "=qm" (c) + :"m" (*(volatile int *)&v->counter) : "memory"); return c != 0; } @@ -160,8 +158,8 @@ static __inline__ int atomic_inc_and_test(atomic_t *v) __asm__ __volatile__( LOCK "incl %0; sete %1" - :"=m" (v->counter), "=qm" (c) - :"m" (v->counter) : "memory"); + :"=m" (*(volatile int *)&v->counter), "=qm" (c) + :"m" (*(volatile int *)&v->counter) : "memory"); return c != 0; } @@ -181,11 +179,20 @@ static __inline__ int atomic_add_negative(int i, atomic_t *v) __asm__ __volatile__( LOCK "addl %2,%0; sets %1" - :"=m" (v->counter), "=qm" (c) - :"ir" (i), "m" (v->counter) : "memory"); + :"=m" (*(volatile int *)&v->counter), "=qm" (c) + :"ir" (i), "m" (*(volatile int *)&v->counter) : "memory"); return c; } +static __inline__ atomic_t atomic_compareandswap( + atomic_t old, atomic_t new, atomic_t *v) +{ + atomic_t rc; + rc.counter = + __cmpxchg(&v->counter, old.counter, new.counter, sizeof(int)); + return rc; +} + /* Atomic operations are already serializing on x86 */ #define smp_mb__before_atomic_dec() barrier() #define smp_mb__after_atomic_dec() barrier() diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 9a26e29f08..c07235fa5e 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -87,9 +87,11 @@ struct pfn_info /* Cleared when the owning guest 'frees' this page. */ #define _PGC_allocated 29 #define PGC_allocated (1<<_PGC_allocated) - /* 28-bit count of references to this frame. */ -#define PGC_count_mask ((1<<29)-1) - + /* This bit is always set, guaranteeing that the count word is never zero. */ +#define _PGC_always_set 28 +#define PGC_always_set (1<<_PGC_always_set) + /* 27-bit count of references to this frame. */ +#define PGC_count_mask ((1<<28)-1) /* We trust the slab allocator in slab.c, and our use of it. */ #define PageSlab(page) (1) @@ -106,7 +108,8 @@ struct pfn_info wmb(); /* install valid domain ptr before updating refcnt. */ \ spin_lock(&(_dom)->page_alloc_lock); \ /* _dom holds an allocation reference */ \ - (_pfn)->u.inuse.count_info = PGC_allocated | 1; \ + ASSERT((_pfn)->u.inuse.count_info == PGC_always_set); \ + (_pfn)->u.inuse.count_info |= PGC_allocated | 1; \ if ( unlikely((_dom)->xenheap_pages++ == 0) ) \ get_knownalive_domain(_dom); \ list_add_tail(&(_pfn)->list, &(_dom)->xenpage_list); \ @@ -150,10 +153,8 @@ static inline int get_page(struct pfn_info *page, unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ unlikely(p != domain) ) /* Wrong owner? */ { - DPRINTK("Error pfn %08lx: ed=%p(%u), sd=%p(%u)," - " caf=%08x, taf=%08x\n", - page_to_pfn(page), domain, domain->domain, - p, (p && !((x & PGC_count_mask) == 0))?p->domain:999, + DPRINTK("Error pfn %08lx: ed=%p, sd=%p, caf=%08x, taf=%08x\n", + page_to_pfn(page), domain, p, x, page->u.inuse.type_info); return 0; } @@ -364,26 +365,21 @@ void ptwr_reconnect_disconnected(unsigned long addr); void ptwr_flush_inactive(void); int ptwr_do_page_fault(unsigned long); -static always_inline void -__cleanup_writable_pagetable( - const int what) -{ - int cpu = smp_processor_id(); - - if (what & PTWR_CLEANUP_ACTIVE) - if (ptwr_info[cpu].disconnected != ENTRIES_PER_L2_PAGETABLE) - ptwr_reconnect_disconnected(0L); - if (what & PTWR_CLEANUP_INACTIVE) - if (ptwr_info[cpu].writable_idx) - ptwr_flush_inactive(); -} - -static always_inline void -cleanup_writable_pagetable( - struct domain *d, const int what) -{ - if ( unlikely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) - __cleanup_writable_pagetable(what); -} +#define __cleanup_writable_pagetable(_what) \ +do { \ + int cpu = smp_processor_id(); \ + if ((_what) & PTWR_CLEANUP_ACTIVE) \ + if (ptwr_info[cpu].disconnected != ENTRIES_PER_L2_PAGETABLE) \ + ptwr_reconnect_disconnected(0L); \ + if ((_what) & PTWR_CLEANUP_INACTIVE) \ + if (ptwr_info[cpu].writable_idx) \ + ptwr_flush_inactive(); \ +} while ( 0 ) + +#define cleanup_writable_pagetable(_d, _w) \ + do { \ + if ( unlikely(VM_ASSIST((_d), VMASST_TYPE_writable_pagetables)) ) \ + __cleanup_writable_pagetable(_w); \ + } while ( 0 ) #endif /* __ASM_X86_MM_H__ */ diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h index 25c29de2e8..b4d79087c5 100644 --- a/xen/include/asm-x86/smp.h +++ b/xen/include/asm-x86/smp.h @@ -1,26 +1,13 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ #ifndef __ASSEMBLY__ #include -/*#include */ -#include -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ #include -#include #include -#ifdef CONFIG_X86_IO_APIC #include -#endif #include #endif -#endif #ifdef CONFIG_SMP #ifndef __ASSEMBLY__ @@ -37,12 +24,6 @@ extern int pic_mode; extern int smp_num_siblings; extern int cpu_sibling_map[]; -extern void smp_flush_tlb(void); -extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); -extern void smp_send_reschedule(int cpu); -extern void smp_invalidate_rcv(void); /* Process an NMI */ -extern void (*mtrr_hook) (void); - /* * On x86 all CPUs are mapped 1:1 to the APIC space. * This simplifies scheduling and IPI sending and diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h index 4b25eec921..4835b6e236 100644 --- a/xen/include/asm-x86/system.h +++ b/xen/include/asm-x86/system.h @@ -30,33 +30,33 @@ static always_inline unsigned long __xchg(unsigned long x, volatile void * ptr, case 1: __asm__ __volatile__("xchgb %b0,%1" :"=q" (x) - :"m" (*__xg(ptr)), "0" (x) + :"m" (*__xg((volatile void *)ptr)), "0" (x) :"memory"); break; case 2: __asm__ __volatile__("xchgw %w0,%1" :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) + :"m" (*__xg((volatile void *)ptr)), "0" (x) :"memory"); break; #if defined(__i386__) case 4: __asm__ __volatile__("xchgl %0,%1" :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) + :"m" (*__xg((volatile void *)ptr)), "0" (x) :"memory"); break; #elif defined(__x86_64__) case 4: __asm__ __volatile__("xchgl %k0,%1" :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) + :"m" (*__xg((volatile void *)ptr)), "0" (x) :"memory"); break; case 8: __asm__ __volatile__("xchgq %0,%1" :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) + :"m" (*__xg((volatile void *)ptr)), "0" (x) :"memory"); break; #endif @@ -78,33 +78,33 @@ static always_inline unsigned long __cmpxchg(volatile void *ptr, unsigned long o case 1: __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "q"(new), "m"(*__xg((volatile void *)ptr)), "0"(old) : "memory"); return prev; case 2: __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old) : "memory"); return prev; #if defined(__i386__) case 4: __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old) : "memory"); return prev; #elif defined(__x86_64__) case 4: __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old) : "memory"); return prev; case 8: __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old) : "memory"); return prev; #endif diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h index 1421486410..395959323c 100644 --- a/xen/include/xen/grant_table.h +++ b/xen/include/xen/grant_table.h @@ -24,6 +24,8 @@ #ifndef __XEN_GRANT_H__ #define __XEN_GRANT_H__ +#include +#include #include /* Active grant entry - used for shadowing GTF_permit_access grants. */ @@ -65,10 +67,19 @@ typedef struct { } grant_table_t; /* Start-of-day system initialisation. */ -void grant_table_init(void); +void grant_table_init( + void); /* Create/destroy per-domain grant table context. */ -int grant_table_create(struct domain *d); -void grant_table_destroy(struct domain *d); +int grant_table_create( + struct domain *d); +void grant_table_destroy( + struct domain *d); + +/* Create/destroy host-CPU mappings via a grant-table entry. */ +int gnttab_try_map( + struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly); +int gnttab_try_unmap( + struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly); #endif /* __XEN_GRANT_H__ */ diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 3c72f6de6a..51fb070673 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -1,6 +1,9 @@ #ifndef __SCHED_H__ #define __SCHED_H__ +#define STACK_SIZE (2*PAGE_SIZE) +#define MAX_DOMAIN_NAME 16 + #include #include #include @@ -10,23 +13,18 @@ #include #include #include -#include #include #include #include #include #include - -#define STACK_SIZE (2*PAGE_SIZE) #include - -#define MAX_DOMAIN_NAME 16 +#include +#include extern unsigned long volatile jiffies; extern rwlock_t tasklist_lock; -#include - struct domain; typedef struct event_channel_st @@ -167,10 +165,19 @@ struct domain *alloc_domain_struct(); * Use this when you don't have an existing reference to @d. It returns * FALSE if @d is being destructed. */ -static inline int get_domain(struct domain *d) +static always_inline int get_domain(struct domain *d) { - atomic_inc(&d->refcnt); - return !(atomic_read(&d->refcnt) & DOMAIN_DESTRUCTED); + atomic_t old, new, seen = d->refcnt; + do + { + old = seen; + if ( unlikely(_atomic_read(old) & DOMAIN_DESTRUCTED) ) + return 0; + _atomic_set(new, _atomic_read(old) + 1); + seen = atomic_compareandswap(old, new, &d->refcnt); + } + while ( unlikely(_atomic_read(seen) != _atomic_read(old)) ); + return 1; } /* -- 2.30.2